Dependencies

library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(readr)
library(viridis)
## Loading required package: viridisLite
library(e1071)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift

Data

16x

Note: there are only few individuals of the small ciliates because only cells bigger than 1000um2 are tracked at this magnification and most small ciliates (Tetrahymena, Dexiostoma and Loxocephalus) are smaller than that.

load("../data/16x/Coleps_viridis/Morph_mvt.RData")
dd16 <- morph_mvt

load("../data/16x/Coleps_irchel/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)

load("../data/16x/Colpidium/Morph_mvt.RData")
morph_mvt$species <- "Colpidium"
dd16 <- rbind(dd16, morph_mvt)

load("../data/16x/Didinium/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)

load("../data/16x/Euplotes/Morph_mvt.RData")
# morph_mvt <- morph_mvt %>% filter(mean_area>3000)
dd16 <- rbind(dd16, morph_mvt)

load("../data/16x/Euplotes_second/Morph_mvt.RData")
# morph_mvt <- morph_mvt %>% filter(mean_area>3000)
dd16 <- rbind(dd16, morph_mvt)

load("../data/16x/Paramecium_bursaria/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)

load("../data/16x/Paramecium_caudatum/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)

load("../data/16x/Stylonychia_1/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia1"
dd16 <- rbind(dd16, morph_mvt)

load("../data/16x/Stylonychia_1_second/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia1"
dd16 <- rbind(dd16, morph_mvt)

load("../data/16x/Stylonychia_2/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia2"
dd16 <- rbind(dd16, morph_mvt)

load("../data/16x/Dexiostoma/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)

load("../data/16x/Loxocephallus/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)

load("../data/16x/Tetrahymena/Morph_mvt.RData")
morph_mvt$species <- "Tetrahymena"
dd16 <- rbind(dd16, morph_mvt)


## filtering

dd16 <- dd16 %>%
  filter(net_disp>50, net_speed>5)

dd16$magnification <- 1.6
dd16$species <- ifelse(dd16$species %in% c("Tetrahymena","Dexiostoma","Loxocephallus"),"Smaller_ciliates",dd16$species)

Species compositions

species <- unique(dd16$species)
compositions <- read_csv("../../compositions.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   composition = col_character()
## )
## See spec(...) for full column specifications.
compositions$Smaller_ciliates <- with(compositions, 
                                      ifelse(Tetrahymena == 1| Dexiostoma == 1| Loxocephallus == 1, 1, 0))

compositions <- compositions[,species]

compositions_list <- apply(compositions, 1, function(x){
  idx <- which(x==1)
  names(idx)
})

names(compositions_list) <- paste0("c_",c(paste0(0,1:9),10:16))

Classifiers

classifier_plot_svm <- function(table, combination.nr){
  # cm <- classifier$confusion
  cm <- table
  ncol <- ncol(cm)
  cm <- apply(cm, 1, function(x) x/sum(x))
  
  cm_long <- data.frame(Predicted = factor(rep(rownames(cm),ncol), levels = rownames(cm)),
                        Truth = factor(rep(colnames(cm), each=ncol), levels = colnames(cm)),
                        Classification = as.vector(cm))
  
  plot <- cm_long %>%
    ggplot(aes(Predicted,Truth,fill=Classification))+
    geom_tile() +
    geom_text(aes(label = round(Classification, 3)), col="white") +
    scale_x_discrete(position = "top") +
    scale_y_discrete(limits = rev(levels(cm_long$Truth))) +
    scale_fill_viridis(option = "D", end=.8, discrete=FALSE)+
    theme(axis.text.x = element_text(angle = 90, hjust = 0))+
    theme(legend.text = element_text(size=14),
          axis.title = element_text(size=14),
          title = element_text(size=16),
          axis.text = element_text(size=13))+
    labs(title=paste("PPV of",combination.nr),fill="PPV")
  
  return(plot)
}


classifier_assessment_plot <- function(cf, combination.nr){

  cf.df <- cf$byClass[,1:4] %>%
    as.data.frame() %>%
    mutate(Species = gsub("Class: ", "", rownames(cf$byClass[,1:5]))) %>%
    rename("NPV" = "Neg Pred Value", "PPV" = "Pos Pred Value",
           "Sens" = "Sensitivity", "Spec" = "Specificity") %>%
    pivot_longer(cols = 1:4) %>%
    mutate(col = ifelse(value>=0.9,"1",
                        ifelse(value<0.9 & value>=0.8,"2",
                               ifelse(value<0.8 & value>=0.7,"3","4"))))

  plot <- cf.df %>%
    ggplot(aes(name,Species,fill=col))+
    geom_tile() +
    geom_text(aes(label = round(value, 3)), col="white") +
    scale_x_discrete(position = "top") +
    scale_y_discrete(limits = rev(levels(as.factor(cf.df$Species)))) +
    scale_fill_manual(values = c("#006837","#66bd63","#f46d43","#a50026"), breaks = c("1","2","3","4")) +
    theme(legend.text = element_text(size=14),
          title = element_text(size = 16),
          axis.title = element_blank(),
          axis.text = element_text(size=13),
          legend.position = "none")+
    labs(title=combination.nr, fill="")
  
  return(plot)
}
formula <- factor(species) ~ mean_area + sd_area + mean_perimeter + sd_perimeter + mean_major +
        sd_major +  mean_ar + sd_ar +
        mean_turning + sd_turning + gross_disp + max_net + net_disp + net_speed + max_step +
        min_step + sd_step + sd_gross_speed + max_gross_speed + min_gross_speed

trainingdata <- dd16[complete.cases(dd16), ]

set.seed(624)

trainingdata$species <- factor(trainingdata$species)
split_up <- split(trainingdata, f = trainingdata$species)
subsamples <- lapply(split_up, function(x) {
  x %>% sample_n(ifelse(nrow(x) < 500, nrow(x), 500))})
trainingdata <- do.call("rbind", subsamples)
# table(sub_trainingdata$species)

# A stratified random split of the data
idx_train <- createDataPartition(trainingdata$species,
                                 p = 0.65, # percentage of data as training
                                 list = FALSE)

testdata <- trainingdata[-idx_train,]
trainingdata <- trainingdata[idx_train,]
  
obj <- tune(svm, formula, data = trainingdata, kernel="radial",
            ranges = list(gamma = 2^(-8:2), cost = 2^(1:10)), probability = TRUE,
            tunecontrol = tune.control(sampling = "fix", best.model = T))




cf <- caret::confusionMatrix(predict(obj$best.model, newdata=testdata %>% select(-species)),
                             testdata$species)

classifier_plot_svm(table = cf$table,
                    combination.nr = "all")

classifier_assessment_plot(cf = cf, combination.nr = "all")

There are mainly 4 measures:

PPV is the most important for us!

formula <- factor(species) ~ mean_area + sd_area + mean_perimeter + sd_perimeter + mean_major +
        sd_major +  mean_ar + sd_ar +
        mean_turning + sd_turning + gross_disp + max_net + net_disp + net_speed + max_step +
        min_step + sd_step + sd_gross_speed + max_gross_speed + min_gross_speed

set.seed(62)
classifiers_18c_16x <- list()
classifiers_18c_16x_plots <- list()
classifiers_18c_16x_assess_plots <- list()

trainingdata <- dd16[complete.cases(dd16), ]

for(i in 1:length(compositions_list)){
  sub_trainingdata <- trainingdata %>%
    filter(species %in% c(compositions_list[[i]]))
  n.var <- length(unique(sub_trainingdata$species))
  sub_trainingdata$species <- factor(sub_trainingdata$species)

  split_up <- split(sub_trainingdata, f = sub_trainingdata$species)
  subsamples <- lapply(split_up, function(x) {
    x %>% sample_n(ifelse(nrow(x) < 500, nrow(x), 500))})
  sub_trainingdata <- do.call("rbind", subsamples)
  # table(sub_trainingdata$species)
  # A stratified random split of the data
  idx_train <- createDataPartition(sub_trainingdata$species,
                                   p = 0.7, # percentage of data as training
                                   list = FALSE)
  
  sub_testdata <- sub_trainingdata[-idx_train,]
  sub_trainingdata <- sub_trainingdata[idx_train,]
  
  n.min <- min(table(sub_trainingdata$species))
  
  obj <- tune(svm, formula, data = sub_trainingdata, kernel="radial",
            ranges = list(gamma = 2^(-8:2), cost = 2^(1:10)), probability = TRUE,
            tunecontrol = tune.control(sampling = "fix", best.model = T))
  
  classifiers_18c_16x[[i]] <- obj$best.model
  
  cf <- caret::confusionMatrix(predict(obj$best.model, newdata=sub_testdata %>% select(-species)),
                       sub_testdata$species)
  
  classifiers_18c_16x_plots[[i]] <- classifier_plot_svm(table = cf$table,
                                                        combination.nr = names(compositions_list)[[i]])
  
  classifiers_18c_16x_assess_plots[[i]] <- classifier_assessment_plot(cf, names(compositions_list)[[i]])
}

names(classifiers_18c_16x_plots) <- names(classifiers_18c_16x) <- 
  names(classifiers_18c_16x_assess_plots) <- paste0("c_",c(paste0(0,1:9),10:16))
library(patchwork)
for(i in 1:16){
  print(classifiers_18c_16x_plots[[i]] + classifiers_18c_16x_assess_plots[[i]] + 
    plot_layout(widths = c(4,2)))
}

Save classifiers

saveRDS(classifiers_18c_16x, "svm_video_classifiers_18c_16x.rds")
# cl <- readRDS("classifiers_18c_16x.rds")